建立随机模型,replace=T表示有放回,prob是概率比例。
population.values <- 1:3
probs <- c(5,2,3)
my.draw <- sample (population.values, 100000, probs, replace=TRUE)
table(my.draw)
## my.draw
## 1 2 3
## 50080 20020 29900
replicate生成多组数据,下面是无放回的,所以相当于随机排序。
sample(1:6)
## [1] 6 2 4 1 5 3
replicate(3,sample(c("Curly","Larry","Moe","Shemp")))
## [,1] [,2] [,3]
## [1,] "Moe" "Shemp" "Shemp"
## [2,] "Curly" "Moe" "Curly"
## [3,] "Shemp" "Curly" "Larry"
## [4,] "Larry" "Larry" "Moe"
bootstrp.resample来生成重抽样(replace=T)
bootstrap.resample <- function (object){
sample (object, length(object), replace=TRUE)}
replicate(5, bootstrap.resample (6:10))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 9 10 9 7 9
## [2,] 7 8 7 9 9
## [3,] 7 6 8 7 8
## [4,] 8 8 9 10 7
## [5,] 6 7 10 8 10
对cat数据集进行重抽样看均值差异。每次运行结果会不一样。
diff.in.means <- function(df) {
mean(df[df$Sex=="M","Hwt"]) - mean(df[df$Sex=="F","Hwt"])
}
resample.diffs <- replicate(1000,
diff.in.means(cats[bootstrap.resample(1:nrow(cats)),]))
得到的图像基本是正态的。
tibble(d = resample.diffs) %>% ggplot(mapping = aes(x = d))+
geom_histogram(bins = 25)+
geom_vline(aes(xintercept = diff.in.means(cats)), col='red')
quantile(resample.diffs,prob=c(0.025,0.975))
## 2.5% 97.5%
## 1.488902 2.733772
qnorm(c(0.025,0.975),mean=mean(resample.diffs),
sd=sd(resample.diffs))
## [1] 1.461083 2.762555
#Markov链 天气与前一天的晴雨有关,但与再之前的无关,是Markov。
sunny.year <- rep(NA, 365)
sunny.year[1] <- 1
for (day in 2:365) {
sunny.year[day] <- rbinom(1,1,0.2 + 0.6*sunny.year[day-1])}
tibble(x = 1:365, y = sunny.year) %>% ggplot(aes(x = x, y = y))+
geom_line()+
labs(title="Sunny Days in An Equatorial City", x="Day", y="Sunshine?")
若天气与之前无关,即晴雨概率相等,则有下图,明显不同。
boring.year <- tibble(day = 1:365, rain = rbinom(365, 1, 0.5))
boring.year %>% ggplot(aes(x = day, y = rain))+
geom_line()+
labs(title="Sunny Days in A Coin Flip City", x="Day", ylab="Sunshine?")
奇奇怪怪年,下一天和上一天相反概率大。
weird.year <- rep(NA, 365)
weird.year[1] <- 1
transition.matrix <- matrix (c(0.2, 0.8, 0.8, 0.2), nrow=2)
for (day in 2:365) weird.year[day] <-
sample(1:2, 1, prob=transition.matrix[weird.year[day-1],])
tibble(day = 1:365, rain = weird.year) %>%
ggplot(aes(x = day, y = rain))+
geom_line()+
labs(title="Sunny Days in Al Gore's Nightmare", x="Day", y="Sunshine?")
##标准Markov函数的形式,transition.matrix是转移矩阵。
rmarkovchain <- function (nn, transition.matrix,
start=sample(1:nrow(transition.matrix), 1)) {
output <- rep (NA, nn)
output[1] <- start
for (day in 2:nn) {
output[day] <-
sample(ncol(transition.matrix), 1,
prob=transition.matrix[output[day-1],])}
}
一个简单的Markov链:随机游走(output记录醉汉的位置)
randomwalk <- function (nn, upprob=0.5, start=50) {
output <- rep (NA, nn)
output[1] <- start
for (iteration in 2:nn)
output[iteration] <-
output[iteration-1] + 2*rbinom(1, 1, upprob) - 1
output
}
tibble(x = 1:10000, y = randomwalk(10000, start=200)) %>%
ggplot(aes(x, y))+ geom_line()+
labs(title="Simple Random Walk")
用随机变量计算积分,由
\[\frac{1}{n}\sum_{i=1}^{n}{f(X_i)} \rightarrow \mathbb{E}_{p}[f(X)] = \int{f(x) p(x) dx}\]
若要求\(\int g(x)dx\),则取\(f(x)=g(x)/p(x)\)。
x1 <- runif(300, 0, 1); y1 <- runif(300, 0, 2.6);
selected <- y1 < dbeta(x1, 3, 6)
选出来落在dbeta之内的即可估算面积。
利用选出的点进行分布计算。
mean(selected)
## [1] 0.3266667
accepted.points <- x1[selected]
mean(accepted.points < 0.5)
## [1] 0.877551
pbeta(0.5, 3, 6)
## [1] 0.8554688
metropolis.one <- function (xx, fun=dpois, ...) {
prop <- 2*rbinom(1,1,0.5) - 1 + xx#随机游动
acc.ratio <- fun(prop, ...)/fun(xx, ...)#fun是个函数
output <- if (acc.ratio > runif(1)) prop else xx
output
}
replicate (10, metropolis.one(10, lambda=5))
## [1] 10 10 9 10 9 9 9 10 11 9
start <- 50
draws <- rep(NA, 10000)
for (iteration in 1:10000) {
start <- metropolis.one (start, lambda=10)
draws[iteration] <- start
}
tibble(n = 1:10000, draws = draws) %>%
ggplot(aes(n, draws))+ geom_point()